{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predicting Terrorist Attacks\n",
    "## Data Preprocessing\n",
    "\n",
    "**Author:** Thomas Skowronek\n",
    "\n",
    "**Date:** March 20, 2018"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Notebook Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Display the value of every top-level expression in a cell, not just the last one\n",
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = \"all\"\n",
    "\n",
    "# Allow up to 150 rows and 150 columns before pandas truncates its output\n",
    "for display_option in ('display.max_rows', 'display.max_columns'):\n",
    "    pd.set_option(display_option, 150)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the Datasets\n",
    "For this project, the two most recent datasets are imported.  The first covers the years 1995 to 2012, and the second spans 2013 to 2016."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the 1995-2012 GTD file; the first CSV column becomes the index\n",
    "gtd_df1 = pd.read_csv('../data/gtd_95to12_0617dist.csv', low_memory=False, index_col=0,\n",
    "                      na_values=[''])\n",
    "\n",
    "# Load the 2013-2016 GTD file with the same parsing options\n",
    "gtd_df2 = pd.read_csv('../data/gtd_13to16_0617dist.csv', low_memory=False, index_col=0,\n",
    "                      na_values=[''])\n",
    "\n",
    "# Stack the two periods row-wise, keeping each file's own index values.\n",
    "# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4\n",
    "# and removed in pandas 2.0.\n",
    "gtd_df = pd.concat([gtd_df1, gtd_df2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Inspect the Structure\n",
    "The data frame contains 135 attributes, one of which is used for the data frame index, and 112,251 observations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 112251 entries, 199501000001 to 201701270001\n",
      "Data columns (total 134 columns):\n",
      "iyear                 int64\n",
      "imonth                int64\n",
      "iday                  int64\n",
      "approxdate            object\n",
      "extended              int64\n",
      "resolution            object\n",
      "country               int64\n",
      "country_txt           object\n",
      "region                int64\n",
      "region_txt            object\n",
      "provstate             object\n",
      "city                  object\n",
      "latitude              float64\n",
      "longitude             float64\n",
      "specificity           float64\n",
      "vicinity              int64\n",
      "location              object\n",
      "summary               object\n",
      "crit1                 int64\n",
      "crit2                 int64\n",
      "crit3                 int64\n",
      "doubtterr             int64\n",
      "alternative           float64\n",
      "alternative_txt       object\n",
      "multiple              int64\n",
      "success               int64\n",
      "suicide               int64\n",
      "attacktype1           int64\n",
      "attacktype1_txt       object\n",
      "attacktype2           float64\n",
      "attacktype2_txt       object\n",
      "attacktype3           float64\n",
      "attacktype3_txt       object\n",
      "targtype1             int64\n",
      "targtype1_txt         object\n",
      "targsubtype1          float64\n",
      "targsubtype1_txt      object\n",
      "corp1                 object\n",
      "target1               object\n",
      "natlty1               float64\n",
      "natlty1_txt           object\n",
      "targtype2             float64\n",
      "targtype2_txt         object\n",
      "targsubtype2          float64\n",
      "targsubtype2_txt      object\n",
      "corp2                 object\n",
      "target2               object\n",
      "natlty2               float64\n",
      "natlty2_txt           object\n",
      "targtype3             float64\n",
      "targtype3_txt         object\n",
      "targsubtype3          float64\n",
      "targsubtype3_txt      object\n",
      "corp3                 object\n",
      "target3               object\n",
      "natlty3               float64\n",
      "natlty3_txt           object\n",
      "gname                 object\n",
      "gsubname              object\n",
      "gname2                object\n",
      "gsubname2             object\n",
      "gname3                object\n",
      "gsubname3             object\n",
      "motive                object\n",
      "guncertain1           float64\n",
      "guncertain2           float64\n",
      "guncertain3           float64\n",
      "individual            int64\n",
      "nperps                float64\n",
      "nperpcap              float64\n",
      "claimed               float64\n",
      "claimmode             float64\n",
      "claimmode_txt         object\n",
      "claim2                float64\n",
      "claimmode2            float64\n",
      "claimmode2_txt        object\n",
      "claim3                float64\n",
      "claimmode3            float64\n",
      "claimmode3_txt        object\n",
      "compclaim             float64\n",
      "weaptype1             int64\n",
      "weaptype1_txt         object\n",
      "weapsubtype1          float64\n",
      "weapsubtype1_txt      object\n",
      "weaptype2             float64\n",
      "weaptype2_txt         object\n",
      "weapsubtype2          float64\n",
      "weapsubtype2_txt      object\n",
      "weaptype3             float64\n",
      "weaptype3_txt         object\n",
      "weapsubtype3          float64\n",
      "weapsubtype3_txt      object\n",
      "weaptype4             float64\n",
      "weaptype4_txt         object\n",
      "weapsubtype4          float64\n",
      "weapsubtype4_txt      object\n",
      "weapdetail            object\n",
      "nkill                 float64\n",
      "nkillus               float64\n",
      "nkillter              float64\n",
      "nwound                float64\n",
      "nwoundus              float64\n",
      "nwoundte              float64\n",
      "property              int64\n",
      "propextent            float64\n",
      "propextent_txt        object\n",
      "propvalue             float64\n",
      "propcomment           object\n",
      "ishostkid             float64\n",
      "nhostkid              float64\n",
      "nhostkidus            float64\n",
      "nhours                float64\n",
      "ndays                 float64\n",
      "divert                object\n",
      "kidhijcountry         object\n",
      "ransom                float64\n",
      "ransomamt             float64\n",
      "ransomamtus           float64\n",
      "ransompaid            float64\n",
      "ransompaidus          float64\n",
      "ransomnote            object\n",
      "hostkidoutcome        float64\n",
      "hostkidoutcome_txt    object\n",
      "nreleased             float64\n",
      "addnotes              object\n",
      "scite1                object\n",
      "scite2                object\n",
      "scite3                object\n",
      "dbsource              object\n",
      "INT_LOG               int64\n",
      "INT_IDEO              int64\n",
      "INT_MISC              int64\n",
      "INT_ANY               int64\n",
      "related               object\n",
      "dtypes: float64(53), int64(23), object(58)\n",
      "memory usage: 115.6+ MB\n"
     ]
    }
   ],
   "source": [
    "# List every column with its dtype, followed by the dtype counts and memory usage\n",
    "gtd_df.info(verbose=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### View Missing Data\n",
    "Calculate the total number and percentage of null values for each attribute.  As the results show, many attributes are missing more than 50% of their values."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Count</th>\n",
       "      <th>Percent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>weaptype4_txt</th>\n",
       "      <td>112245</td>\n",
       "      <td>99.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype4</th>\n",
       "      <td>112245</td>\n",
       "      <td>99.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype4_txt</th>\n",
       "      <td>112244</td>\n",
       "      <td>99.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype4</th>\n",
       "      <td>112244</td>\n",
       "      <td>99.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gsubname3</th>\n",
       "      <td>112238</td>\n",
       "      <td>99.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode3</th>\n",
       "      <td>112139</td>\n",
       "      <td>99.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode3_txt</th>\n",
       "      <td>112139</td>\n",
       "      <td>99.90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gsubname2</th>\n",
       "      <td>112103</td>\n",
       "      <td>99.87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>divert</th>\n",
       "      <td>112092</td>\n",
       "      <td>99.86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>guncertain3</th>\n",
       "      <td>111997</td>\n",
       "      <td>99.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claim3</th>\n",
       "      <td>111996</td>\n",
       "      <td>99.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gname3</th>\n",
       "      <td>111993</td>\n",
       "      <td>99.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype3_txt</th>\n",
       "      <td>111879</td>\n",
       "      <td>99.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype3</th>\n",
       "      <td>111879</td>\n",
       "      <td>99.67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransomnote</th>\n",
       "      <td>111786</td>\n",
       "      <td>99.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransompaidus</th>\n",
       "      <td>111766</td>\n",
       "      <td>99.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransomamtus</th>\n",
       "      <td>111756</td>\n",
       "      <td>99.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode2</th>\n",
       "      <td>111737</td>\n",
       "      <td>99.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode2_txt</th>\n",
       "      <td>111737</td>\n",
       "      <td>99.54</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransompaid</th>\n",
       "      <td>111676</td>\n",
       "      <td>99.49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransomamt</th>\n",
       "      <td>111555</td>\n",
       "      <td>99.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>corp3</th>\n",
       "      <td>111417</td>\n",
       "      <td>99.26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>kidhijcountry</th>\n",
       "      <td>111386</td>\n",
       "      <td>99.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype3</th>\n",
       "      <td>111357</td>\n",
       "      <td>99.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype3_txt</th>\n",
       "      <td>111357</td>\n",
       "      <td>99.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty3</th>\n",
       "      <td>111333</td>\n",
       "      <td>99.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty3_txt</th>\n",
       "      <td>111333</td>\n",
       "      <td>99.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype3</th>\n",
       "      <td>111306</td>\n",
       "      <td>99.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype3_txt</th>\n",
       "      <td>111306</td>\n",
       "      <td>99.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target3</th>\n",
       "      <td>111305</td>\n",
       "      <td>99.16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype3_txt</th>\n",
       "      <td>111087</td>\n",
       "      <td>98.96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype3</th>\n",
       "      <td>111087</td>\n",
       "      <td>98.96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>resolution</th>\n",
       "      <td>111039</td>\n",
       "      <td>98.92</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype3_txt</th>\n",
       "      <td>110989</td>\n",
       "      <td>98.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype3</th>\n",
       "      <td>110989</td>\n",
       "      <td>98.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>guncertain2</th>\n",
       "      <td>110707</td>\n",
       "      <td>98.62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claim2</th>\n",
       "      <td>110653</td>\n",
       "      <td>98.58</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gname2</th>\n",
       "      <td>110648</td>\n",
       "      <td>98.57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nhours</th>\n",
       "      <td>109697</td>\n",
       "      <td>97.72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gsubname</th>\n",
       "      <td>109396</td>\n",
       "      <td>97.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>compclaim</th>\n",
       "      <td>107549</td>\n",
       "      <td>95.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype2</th>\n",
       "      <td>106667</td>\n",
       "      <td>95.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype2_txt</th>\n",
       "      <td>106667</td>\n",
       "      <td>95.03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ndays</th>\n",
       "      <td>105728</td>\n",
       "      <td>94.19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>approxdate</th>\n",
       "      <td>104815</td>\n",
       "      <td>93.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype2</th>\n",
       "      <td>104473</td>\n",
       "      <td>93.07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype2_txt</th>\n",
       "      <td>104473</td>\n",
       "      <td>93.07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nreleased</th>\n",
       "      <td>103995</td>\n",
       "      <td>92.65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hostkidoutcome</th>\n",
       "      <td>103735</td>\n",
       "      <td>92.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hostkidoutcome_txt</th>\n",
       "      <td>103735</td>\n",
       "      <td>92.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype2</th>\n",
       "      <td>103663</td>\n",
       "      <td>92.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype2_txt</th>\n",
       "      <td>103663</td>\n",
       "      <td>92.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>corp2</th>\n",
       "      <td>103660</td>\n",
       "      <td>92.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nhostkidus</th>\n",
       "      <td>103325</td>\n",
       "      <td>92.05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nhostkid</th>\n",
       "      <td>103270</td>\n",
       "      <td>92.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype2</th>\n",
       "      <td>103106</td>\n",
       "      <td>91.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype2_txt</th>\n",
       "      <td>103106</td>\n",
       "      <td>91.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty2</th>\n",
       "      <td>103094</td>\n",
       "      <td>91.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty2_txt</th>\n",
       "      <td>103094</td>\n",
       "      <td>91.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target2</th>\n",
       "      <td>102894</td>\n",
       "      <td>91.66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype2</th>\n",
       "      <td>102849</td>\n",
       "      <td>91.62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype2_txt</th>\n",
       "      <td>102849</td>\n",
       "      <td>91.62</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode_txt</th>\n",
       "      <td>96257</td>\n",
       "      <td>85.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimmode</th>\n",
       "      <td>96257</td>\n",
       "      <td>85.75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>alternative</th>\n",
       "      <td>95119</td>\n",
       "      <td>84.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>alternative_txt</th>\n",
       "      <td>95119</td>\n",
       "      <td>84.74</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>related</th>\n",
       "      <td>95060</td>\n",
       "      <td>84.69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ransom</th>\n",
       "      <td>93754</td>\n",
       "      <td>83.52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>addnotes</th>\n",
       "      <td>87812</td>\n",
       "      <td>78.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>propvalue</th>\n",
       "      <td>86751</td>\n",
       "      <td>77.28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scite3</th>\n",
       "      <td>73833</td>\n",
       "      <td>65.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>propextent</th>\n",
       "      <td>68783</td>\n",
       "      <td>61.28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>propextent_txt</th>\n",
       "      <td>68783</td>\n",
       "      <td>61.28</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>location</th>\n",
       "      <td>65028</td>\n",
       "      <td>57.93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>motive</th>\n",
       "      <td>64685</td>\n",
       "      <td>57.63</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>propcomment</th>\n",
       "      <td>60849</td>\n",
       "      <td>54.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapdetail</th>\n",
       "      <td>54496</td>\n",
       "      <td>48.55</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scite2</th>\n",
       "      <td>43776</td>\n",
       "      <td>39.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nperps</th>\n",
       "      <td>21234</td>\n",
       "      <td>18.92</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>corp1</th>\n",
       "      <td>14327</td>\n",
       "      <td>12.76</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nwoundte</th>\n",
       "      <td>13272</td>\n",
       "      <td>11.82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nperpcap</th>\n",
       "      <td>12624</td>\n",
       "      <td>11.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nkillter</th>\n",
       "      <td>11498</td>\n",
       "      <td>10.24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype1_txt</th>\n",
       "      <td>10304</td>\n",
       "      <td>9.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weapsubtype1</th>\n",
       "      <td>10304</td>\n",
       "      <td>9.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nwoundus</th>\n",
       "      <td>9642</td>\n",
       "      <td>8.59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nkillus</th>\n",
       "      <td>9442</td>\n",
       "      <td>8.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>scite1</th>\n",
       "      <td>9325</td>\n",
       "      <td>8.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>summary</th>\n",
       "      <td>9263</td>\n",
       "      <td>8.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>claimed</th>\n",
       "      <td>9260</td>\n",
       "      <td>8.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nwound</th>\n",
       "      <td>7556</td>\n",
       "      <td>6.73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype1</th>\n",
       "      <td>6779</td>\n",
       "      <td>6.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targsubtype1_txt</th>\n",
       "      <td>6779</td>\n",
       "      <td>6.04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nkill</th>\n",
       "      <td>3776</td>\n",
       "      <td>3.36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>provstate</th>\n",
       "      <td>2598</td>\n",
       "      <td>2.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>longitude</th>\n",
       "      <td>1407</td>\n",
       "      <td>1.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>latitude</th>\n",
       "      <td>1407</td>\n",
       "      <td>1.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty1</th>\n",
       "      <td>1113</td>\n",
       "      <td>0.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>natlty1_txt</th>\n",
       "      <td>1113</td>\n",
       "      <td>0.99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>city</th>\n",
       "      <td>446</td>\n",
       "      <td>0.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>guncertain1</th>\n",
       "      <td>379</td>\n",
       "      <td>0.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>target1</th>\n",
       "      <td>238</td>\n",
       "      <td>0.21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>specificity</th>\n",
       "      <td>4</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ishostkid</th>\n",
       "      <td>3</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>INT_MISC</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>INT_IDEO</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>INT_ANY</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>INT_LOG</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>dbsource</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>iyear</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>property</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype1_txt</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>iday</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>extended</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>country_txt</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>region_txt</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vicinity</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>crit1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>crit2</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>crit3</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>doubtterr</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>multiple</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>success</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>suicide</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>attacktype1_txt</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>targtype1_txt</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gname</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>imonth</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weaptype1</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>individual</th>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     Count  Percent\n",
       "weaptype4_txt       112245    99.99\n",
       "weaptype4           112245    99.99\n",
       "weapsubtype4_txt    112244    99.99\n",
       "weapsubtype4        112244    99.99\n",
       "gsubname3           112238    99.99\n",
       "claimmode3          112139    99.90\n",
       "claimmode3_txt      112139    99.90\n",
       "gsubname2           112103    99.87\n",
       "divert              112092    99.86\n",
       "guncertain3         111997    99.77\n",
       "claim3              111996    99.77\n",
       "gname3              111993    99.77\n",
       "attacktype3_txt     111879    99.67\n",
       "attacktype3         111879    99.67\n",
       "ransomnote          111786    99.59\n",
       "ransompaidus        111766    99.57\n",
       "ransomamtus         111756    99.56\n",
       "claimmode2          111737    99.54\n",
       "claimmode2_txt      111737    99.54\n",
       "ransompaid          111676    99.49\n",
       "ransomamt           111555    99.38\n",
       "corp3               111417    99.26\n",
       "kidhijcountry       111386    99.23\n",
       "targsubtype3        111357    99.20\n",
       "targsubtype3_txt    111357    99.20\n",
       "natlty3             111333    99.18\n",
       "natlty3_txt         111333    99.18\n",
       "targtype3           111306    99.16\n",
       "targtype3_txt       111306    99.16\n",
       "target3             111305    99.16\n",
       "weapsubtype3_txt    111087    98.96\n",
       "weapsubtype3        111087    98.96\n",
       "resolution          111039    98.92\n",
       "weaptype3_txt       110989    98.88\n",
       "weaptype3           110989    98.88\n",
       "guncertain2         110707    98.62\n",
       "claim2              110653    98.58\n",
       "gname2              110648    98.57\n",
       "nhours              109697    97.72\n",
       "gsubname            109396    97.46\n",
       "compclaim           107549    95.81\n",
       "attacktype2         106667    95.03\n",
       "attacktype2_txt     106667    95.03\n",
       "ndays               105728    94.19\n",
       "approxdate          104815    93.38\n",
       "weapsubtype2        104473    93.07\n",
       "weapsubtype2_txt    104473    93.07\n",
       "nreleased           103995    92.65\n",
       "hostkidoutcome      103735    92.41\n",
       "hostkidoutcome_txt  103735    92.41\n",
       "weaptype2           103663    92.35\n",
       "weaptype2_txt       103663    92.35\n",
       "corp2               103660    92.35\n",
       "nhostkidus          103325    92.05\n",
       "nhostkid            103270    92.00\n",
       "targsubtype2        103106    91.85\n",
       "targsubtype2_txt    103106    91.85\n",
       "natlty2             103094    91.84\n",
       "natlty2_txt         103094    91.84\n",
       "target2             102894    91.66\n",
       "targtype2           102849    91.62\n",
       "targtype2_txt       102849    91.62\n",
       "claimmode_txt        96257    85.75\n",
       "claimmode            96257    85.75\n",
       "alternative          95119    84.74\n",
       "alternative_txt      95119    84.74\n",
       "related              95060    84.69\n",
       "ransom               93754    83.52\n",
       "addnotes             87812    78.23\n",
       "propvalue            86751    77.28\n",
       "scite3               73833    65.77\n",
       "propextent           68783    61.28\n",
       "propextent_txt       68783    61.28\n",
       "location             65028    57.93\n",
       "motive               64685    57.63\n",
       "propcomment          60849    54.21\n",
       "weapdetail           54496    48.55\n",
       "scite2               43776    39.00\n",
       "nperps               21234    18.92\n",
       "corp1                14327    12.76\n",
       "nwoundte             13272    11.82\n",
       "nperpcap             12624    11.25\n",
       "nkillter             11498    10.24\n",
       "weapsubtype1_txt     10304     9.18\n",
       "weapsubtype1         10304     9.18\n",
       "nwoundus              9642     8.59\n",
       "nkillus               9442     8.41\n",
       "scite1                9325     8.31\n",
       "summary               9263     8.25\n",
       "claimed               9260     8.25\n",
       "nwound                7556     6.73\n",
       "targsubtype1          6779     6.04\n",
       "targsubtype1_txt      6779     6.04\n",
       "nkill                 3776     3.36\n",
       "provstate             2598     2.31\n",
       "longitude             1407     1.25\n",
       "latitude              1407     1.25\n",
       "natlty1               1113     0.99\n",
       "natlty1_txt           1113     0.99\n",
       "city                   446     0.40\n",
       "guncertain1            379     0.34\n",
       "target1                238     0.21\n",
       "specificity              4     0.00\n",
       "ishostkid                3     0.00\n",
       "INT_MISC                 0     0.00\n",
       "INT_IDEO                 0     0.00\n",
       "INT_ANY                  0     0.00\n",
       "INT_LOG                  0     0.00\n",
       "dbsource                 0     0.00\n",
       "iyear                    0     0.00\n",
       "property                 0     0.00\n",
       "weaptype1_txt            0     0.00\n",
       "iday                     0     0.00\n",
       "extended                 0     0.00\n",
       "country                  0     0.00\n",
       "country_txt              0     0.00\n",
       "region                   0     0.00\n",
       "region_txt               0     0.00\n",
       "vicinity                 0     0.00\n",
       "crit1                    0     0.00\n",
       "crit2                    0     0.00\n",
       "crit3                    0     0.00\n",
       "doubtterr                0     0.00\n",
       "multiple                 0     0.00\n",
       "success                  0     0.00\n",
       "suicide                  0     0.00\n",
       "attacktype1              0     0.00\n",
       "attacktype1_txt          0     0.00\n",
       "targtype1                0     0.00\n",
       "targtype1_txt            0     0.00\n",
       "gname                    0     0.00\n",
       "imonth                   0     0.00\n",
       "weaptype1                0     0.00\n",
       "individual               0     0.00"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check the number of missing values in each attribute\n",
    "count = gtd_df.isnull().sum()\n",
    "percent = round(count / 112251 * 100, 2)\n",
    "series = [count, percent]\n",
    "result = pd.concat(series, axis=1, keys=['Count','Percent'])\n",
    "result.sort_values(by='Count', ascending=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Identify the First Pass of Target Attributes\n",
    "Select the list of attributes that contain missing values of less than 20% and that are not duplicated by another attribute."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['iyear', 'imonth', 'iday', 'extended', 'country', 'country_txt',\n",
       "       'region', 'region_txt', 'provstate', 'city', 'latitude',\n",
       "       'longitude', 'specificity', 'vicinity', 'summary', 'crit1', 'crit2',\n",
       "       'crit3', 'doubtterr', 'multiple', 'success', 'suicide',\n",
       "       'attacktype1', 'attacktype1_txt', 'targtype1', 'targtype1_txt',\n",
       "       'targsubtype1', 'targsubtype1_txt', 'corp1', 'target1', 'natlty1',\n",
       "       'natlty1_txt', 'gname', 'guncertain1', 'individual', 'nperpcap',\n",
       "       'claimed', 'weaptype1', 'weaptype1_txt', 'weapsubtype1',\n",
       "       'weapsubtype1_txt', 'nkill', 'nkillus', 'nkillter', 'nwound',\n",
       "       'nwoundus', 'nwoundte', 'property', 'ishostkid', 'scite1',\n",
       "       'dbsource', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY'], dtype=object)"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_attrs = result[result['Percent'] < 20.0]\n",
    "keep_attrs = target_attrs.index.values\n",
    "\n",
    "# The nperps attribute contain 18.91% blank values.  However, an additional 64.31% are \n",
    "# coded (-99, -9) as unknown.\n",
    "keep_attrs = keep_attrs[keep_attrs != 'nperps']\n",
    "keep_attrs\n",
    "\n",
    "# Remove attributes that duplicate another attribute\n",
    "keep_attrs = keep_attrs[keep_attrs != 'country']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'region']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'attacktype1']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'targtype1']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'targsubtype1']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'natlty1']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'weaptype1']\n",
    "keep_attrs = keep_attrs[keep_attrs != 'weapsubtype1']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Subset the Original Dataset\n",
    "Only include the attributes in the target set of attributes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 112251 entries, 199501000001 to 201701270001\n",
      "Data columns (total 47 columns):\n",
      "iyear               112251 non-null int64\n",
      "imonth              112251 non-null int64\n",
      "iday                112251 non-null int64\n",
      "extended            112251 non-null int64\n",
      "country_txt         112251 non-null object\n",
      "region_txt          112251 non-null object\n",
      "provstate           109653 non-null object\n",
      "city                111805 non-null object\n",
      "latitude            110844 non-null float64\n",
      "longitude           110844 non-null float64\n",
      "specificity         112247 non-null float64\n",
      "vicinity            112251 non-null int64\n",
      "summary             102988 non-null object\n",
      "crit1               112251 non-null int64\n",
      "crit2               112251 non-null int64\n",
      "crit3               112251 non-null int64\n",
      "doubtterr           112251 non-null int64\n",
      "multiple            112251 non-null int64\n",
      "success             112251 non-null int64\n",
      "suicide             112251 non-null int64\n",
      "attacktype1_txt     112251 non-null object\n",
      "targtype1_txt       112251 non-null object\n",
      "targsubtype1_txt    105472 non-null object\n",
      "corp1               97924 non-null object\n",
      "target1             112013 non-null object\n",
      "natlty1_txt         111138 non-null object\n",
      "gname               112251 non-null object\n",
      "guncertain1         111872 non-null float64\n",
      "individual          112251 non-null int64\n",
      "nperpcap            99627 non-null float64\n",
      "claimed             102991 non-null float64\n",
      "weaptype1_txt       112251 non-null object\n",
      "weapsubtype1_txt    101947 non-null object\n",
      "nkill               108475 non-null float64\n",
      "nkillus             102809 non-null float64\n",
      "nkillter            100753 non-null float64\n",
      "nwound              104695 non-null float64\n",
      "nwoundus            102609 non-null float64\n",
      "nwoundte            98979 non-null float64\n",
      "property            112251 non-null int64\n",
      "ishostkid           112248 non-null float64\n",
      "scite1              102926 non-null object\n",
      "dbsource            112251 non-null object\n",
      "INT_LOG             112251 non-null int64\n",
      "INT_IDEO            112251 non-null int64\n",
      "INT_MISC            112251 non-null int64\n",
      "INT_ANY             112251 non-null int64\n",
      "dtypes: float64(13), int64(18), object(16)\n",
      "memory usage: 41.1+ MB\n"
     ]
    }
   ],
   "source": [
    "subset_df = gtd_df.loc[:, keep_attrs]\n",
    "subset_df.info(verbose = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fix Missing Values\n",
    "The code book is not consistent when classify missing or unknown values. The original data included, blanks, -9, an -99.  For consistency, -1 is used for categorical attributes that are numeric and UNKNOWN is used for categorical attributes that are text.  Numeric attributes that contain coded missing values are replaced with NAN."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Categorical Variables\n",
    "# ---------------------\n",
    "subset_df['specificity'].fillna(-1, inplace=True)\n",
    "\n",
    "subset_df.loc[subset_df['vicinity'] == -9, 'vicinity'] = -1\n",
    "\n",
    "subset_df.loc[subset_df['doubtterr'] == -9, 'doubtterr'] = -1\n",
    "\n",
    "subset_df['targsubtype1_txt'].fillna('UNKNOWN', inplace=True)\n",
    "\n",
    "subset_df['natlty1_txt'].fillna('UNKNOWN', inplace=True)\n",
    "\n",
    "subset_df['guncertain1'].fillna(-1, inplace=True)\n",
    "\n",
    "subset_df['claimed'].fillna(-1, inplace=True)\n",
    "subset_df.loc[subset_df['claimed'] == -9, 'claimed'] = -1\n",
    "\n",
    "subset_df['weapsubtype1_txt'].fillna('UNKNOWN', inplace=True)\n",
    "\n",
    "subset_df.loc[subset_df['property'] == -9, 'property'] = -1\n",
    "\n",
    "subset_df['ishostkid'].fillna(-1, inplace=True)\n",
    "subset_df.loc[subset_df['ishostkid'] == -9, 'ishostkid'] = -1\n",
    "\n",
    "subset_df.loc[subset_df['INT_LOG'] == -9, 'INT_LOG'] = -1\n",
    "\n",
    "subset_df.loc[subset_df['INT_IDEO'] == -9, 'INT_IDEO'] = -1\n",
    "\n",
    "subset_df.loc[subset_df['INT_MISC'] == -9, 'INT_MISC'] = -1\n",
    "\n",
    "subset_df.loc[subset_df['INT_ANY'] == -9, 'INT_ANY'] = -1\n",
    "\n",
    "\n",
    "# Numeric Variables\n",
    "# -----------------\n",
    "subset_df.loc[subset_df['nperpcap'] == -9, 'nperpcap'] = np.nan\n",
    "subset_df.loc[subset_df['nperpcap'] == -99, 'nperpcap'] = np.nan\n",
    "\n",
    "\n",
    "# Text Variables\n",
    "# --------------\n",
    "subset_df['provstate'].fillna('UNKNOWN', inplace=True)\n",
    "subset_df['city'].fillna('UNKNOWN', inplace=True)\n",
    "subset_df.loc[subset_df['city'] == 'Unknown', 'city'] = 'UNKNOWN'\n",
    "subset_df['summary'].fillna('UNKNOWN', inplace=True)\n",
    "subset_df['corp1'].fillna('UNKNOWN', inplace=True)\n",
    "subset_df['target1'].fillna('UNKNOWN', inplace=True)\n",
    "subset_df['scite1'].fillna('UNKNOWN', inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Map Yes/No/Uknown Codes\n",
    "Many attributes contain codes of 1, 0, -1 to represent Yes, No, and Unknown.  Replace the codes with labels to improve exploratory data analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 112251 entries, 199501000001 to 201701270001\n",
      "Data columns (total 47 columns):\n",
      "iyear               112251 non-null int64\n",
      "imonth              112251 non-null int64\n",
      "iday                112251 non-null int64\n",
      "country_txt         112251 non-null object\n",
      "region_txt          112251 non-null object\n",
      "provstate           112251 non-null object\n",
      "city                112251 non-null object\n",
      "latitude            110844 non-null float64\n",
      "longitude           110844 non-null float64\n",
      "specificity         112251 non-null float64\n",
      "summary             112251 non-null object\n",
      "attacktype1_txt     112251 non-null object\n",
      "targtype1_txt       112251 non-null object\n",
      "targsubtype1_txt    112251 non-null object\n",
      "corp1               112251 non-null object\n",
      "target1             112251 non-null object\n",
      "natlty1_txt         112251 non-null object\n",
      "gname               112251 non-null object\n",
      "nperpcap            98849 non-null float64\n",
      "weaptype1_txt       112251 non-null object\n",
      "weapsubtype1_txt    112251 non-null object\n",
      "nkill               108475 non-null float64\n",
      "nkillus             102809 non-null float64\n",
      "nkillter            100753 non-null float64\n",
      "nwound              104695 non-null float64\n",
      "nwoundus            102609 non-null float64\n",
      "nwoundte            98979 non-null float64\n",
      "scite1              112251 non-null object\n",
      "dbsource            112251 non-null object\n",
      "extended_txt        112251 non-null object\n",
      "vicinity_txt        112251 non-null object\n",
      "crit1_txt           112251 non-null object\n",
      "crit2_txt           112251 non-null object\n",
      "crit3_txt           112251 non-null object\n",
      "doubtterr_txt       112251 non-null object\n",
      "multiple_txt        112251 non-null object\n",
      "success_txt         112251 non-null object\n",
      "suicide_txt         112251 non-null object\n",
      "guncertain1_txt     112251 non-null object\n",
      "individual_txt      112251 non-null object\n",
      "claimed_txt         112251 non-null object\n",
      "property_txt        112251 non-null object\n",
      "ishostkid_txt       112251 non-null object\n",
      "INT_LOG_txt         112251 non-null object\n",
      "INT_IDEO_txt        112251 non-null object\n",
      "INT_MISC_txt        112251 non-null object\n",
      "INT_ANY_txt         112251 non-null object\n",
      "dtypes: float64(10), int64(3), object(34)\n",
      "memory usage: 41.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# Map the codes to labels\n",
    "ynu_map = {1: 'YES', 0: 'NO', -1: 'UKNOWN'}\n",
    "\n",
    "# List of target attributes to map\n",
    "ynu_attrs =['extended', 'vicinity', 'crit1', 'crit2', 'crit3', 'doubtterr', 'multiple', \n",
    "            'success', 'suicide', 'guncertain1', 'individual', 'claimed', 'property', \n",
    "            'ishostkid', 'INT_LOG', 'INT_IDEO', 'INT_MISC', 'INT_ANY']\n",
    "\n",
    "# Iterate over each target attribute and map it\n",
    "for att in ynu_attrs:\n",
    "    att_txt = att + '_txt'\n",
    "    subset_df[att_txt] = subset_df[att].map(ynu_map)\n",
    "\n",
    "# Get the list of attributes, dropping the coded for labeled attributes\n",
    "final_attrs = []\n",
    "\n",
    "for attr in subset_df.columns.values:\n",
    "    if attr not in ynu_attrs:\n",
    "        final_attrs.append(attr)\n",
    "        \n",
    "subset_df2 = subset_df.loc[:, final_attrs]\n",
    "subset_df2.info(verbose = True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the Preprocessed Data\n",
    "Output the new data frame to a CSV file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "subset_df2.to_csv(\"../data/gtd_preprocessed_95t016.csv\", sep = \",\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
